#loading packages and dataset
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.5      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.2
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(usmap)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
covid<-read.csv("data/Provisional_COVID-19_Deaths_by_Sex_and_Age.csv")
#combined New York City with New York as they were originally separated
covid$State[covid$State=="New York City"]<-'New York'

#refactored age ranges into (teenager, adult, senior, child, infant, young adult)
covid$age<-factor(covid$Age.Group, levels=c("infant", "child", "young adult", "adult", "senior", "overall"))
covid$age[covid$Age.Group=="65-74 years"|covid$Age.Group=="75-84 years"|covid$Age.Group=="85 years and over"]<-"senior"
covid$age[covid$Age.Group=="55-64 years"|covid$Age.Group=="30-39 years"|covid$Age.Group=="35-44 years"|covid$Age.Group=="45-54 years"|covid$Age.Group=="40-49 years"|covid$Age.Group=="50-64 years"]<-"adult"
covid$age[covid$Age.Group=="Under 1 year"]<-"infant"
covid$age[covid$Age.Group=="1-4 years"|covid$Age.Group=="0-17 years"|covid$Age.Group=="5-14 years"]<-"child"
covid$age[covid$Age.Group=="18-29 years"|covid$Age.Group=="15-24 years"|covid$Age.Group=="25-34 years"]<-"young adult"
covid$age[covid$Age.Group=="All Ages"]<-"overall"

#created another new dataframe "covid_age" to use for following chart
covid_age<-covid%>%
  filter(State=="United States", Sex=="Male"| Sex=="Female", age!="overall")%>%
  group_by(age, Sex)%>%
  summarize(total_covid_deaths=sum(COVID.19.Deaths, na.rm=T))

#created a bargraph to show the deaths due to covid within each age group & gender
agevsdeath<-ggplot(covid_age, aes(fill=Sex, y=total_covid_deaths, x=age)) + 
  geom_bar(position=position_dodge(), stat="identity")+
  labs(x="Age Group", y="Total Covid Deaths", title="Total Covid Deaths by Age")

In every age group there are more males than females that die to covid, even in infants and children which suggests a genetic component to the differences in covid mortality.

#creating a interactive plot from the ggplot above (covid deaths vs age & gender)
ggplotly(agevsdeath)
#created a new dataframe to base following interactive map on
geo_covid<-covid%>%
  filter(Age.Group=="All Ages", Sex=="Male"|Sex=="Female", State!="United States", Group!="By Total")%>%
  group_by(State,Sex)%>%
  summarize(total_covid_death=sum(COVID.19.Deaths, na.rm=T))%>%
  mutate(code=state.abb[match(State, state.name)])%>%
  pivot_wider(names_from = "Sex", values_from="total_covid_death")%>%
  mutate(percent_male=round(100*Male/(Female+Male)))%>%
  mutate(percent_female=round(100*Female/(Female+Male)))%>%
  mutate(total_death=Female+Male)

The trend that appears is that males are more disportionately affected in the “west” states than the east

#created an interactive map showing number of covid deaths per state throughout the years
plot_geo(data=geo_covid, locationmode='USA-states')%>%
  add_trace(locations=~code, 
            z=~percent_male,
            text=~paste("State:", code, "<br>", 
                        "Female Deaths:", Female, "<br>", 
                        "Male Deaths:", Male, "<br>", 
                        "Percent Male Deaths(%):", percent_male, "<br>", 
                        "Percent Female Deaths(%):", percent_female, "<br>", 
                        "Total Covid Deaths:", total_death))%>%
  colorbar(title="Number of Covid Deaths")%>%
  layout(title="2020-2022 US Covid Deaths by State", geo=list(scope='usa'))

Not a single state has more female deaths than male deaths

#top 10 states with greatest percent male death
geo_covid%>%
  arrange(desc(percent_male))
## # A tibble: 52 × 7
## # Groups:   State [52]
##    State      code  Female   Male percent_male percent_female total_death
##    <chr>      <chr>  <int>  <int>        <dbl>          <dbl>       <int>
##  1 Nevada     NV      9480  14888           61             39       24368
##  2 Utah       UT      4580   7037           61             39       11617
##  3 Hawaii     HI      1417   2168           60             40        3585
##  4 Alaska     AK      1134   1637           59             41        2771
##  5 Arizona    AZ     24831  35431           59             41       60262
##  6 California CA     90060 126321           58             42      216381
##  7 Idaho      ID      4621   6445           58             42       11066
##  8 Texas      TX     88134 119445           58             42      207579
##  9 Colorado   CO     13220  17304           57             43       30524
## 10 Montana    MT      3375   4472           57             43        7847
## # … with 42 more rows
#top 10 states with lowest percent male death
geo_covid%>%
  arrange(percent_male)
## # A tibble: 52 × 7
## # Groups:   State [52]
##    State         code  Female  Male percent_male percent_female total_death
##    <chr>         <chr>  <int> <int>        <dbl>          <dbl>       <int>
##  1 Rhode Island  RI      3973  4051           50             50        8024
##  2 Connecticut   CT     12224 12703           51             49       24927
##  3 Massachusetts MA     20546 21613           51             49       42159
##  4 Delaware      DE      3214  3547           52             48        6761
##  5 Kentucky      KY     19331 20854           52             48       40185
##  6 Maine         ME      3011  3273           52             48        6284
##  7 Mississippi   MS     14304 15301           52             48       29605
##  8 Pennsylvania  PA     50288 55089           52             48      105377
##  9 Arkansas      AR     11692 13414           53             47       25106
## 10 Indiana       IN     25121 28290           53             47       53411
## # … with 42 more rows
#taking 2 states with the greatest percentage of male deaths, 2 with the lowest, and 2 in the middle and placing them into a new dataset
state_age<-covid%>%
  filter(State=="Nevada"|State=="Utah"|State=="Connecticut"|State=="Massachusetts"|State=="South Carolina"|State=="Tennessee", Sex=="Male"| Sex=="Female", age!="overall")%>%
  group_by(age, Sex, State)%>%
  summarize(total_covid_deaths=sum(COVID.19.Deaths, na.rm=T))

It appears that in the states where males have a significantly greater covid mortality percentage, senior males are what accounts for the difference.

#creating mulitple bargraphs to see which age group is most effected in the states with a higher percentage of male covid deaths
plotly_age_state<-ggplot(state_age, aes(fill=Sex, y=total_covid_deaths, x=age)) + 
  geom_bar(position=position_dodge(), stat="identity")+
  facet_wrap(~State)+
  labs(x="Age Group", y="Total Covid Deaths", title="Total Covid Deaths by Age")

ggplotly(plotly_age_state)

Regardless of the season, there are more males that die due to covid than females

#subsetting the months into seasons
covid$seasons<-factor(covid$Month, levels = c("summer", "fall", "spring", "winter"))
covid$seasons[covid$Month<=2|covid$Month==12]<-"winter"
covid$seasons[covid$Month>=6 & covid$Month<=8]<-"summer"
covid$seasons[covid$Month>=9 & covid$Month<=11]<-"fall"
covid$seasons[covid$Month>=3 & covid$Month<=5]<-"spring"

#creating a double bar chart to compare male and female covid deaths in different seasons
covid%>%
  filter(Group=="By Month", Sex!="All Sexes")%>%
  ggplot(aes(x=seasons, y=COVID.19.Deaths, fill=Sex))+
  geom_col(position=position_dodge(), stat="identity")+
  labs(title="Number of All Time Covid Deaths by Season",x="Seasons", y="Number of Covid Deaths")

#creating a double bar chart to compare male and female covid deaths throughout the years
covid%>%
  filter(Group=="By Year", Sex!="All Sexes")%>%
  ggplot(aes(x=Year, y=COVID.19.Deaths, fill=Sex))+
  geom_col(position=position_dodge(), stat="identity")+
  labs(title="Number of All Time Covid Deaths by Year",x="year", y="Number of Covid Deaths")